Source code for nlp_architect.pipelines.spacy_np_annotator

# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
import pickle
from os import path

import numpy as np
import spacy
from spacy.tokens import Doc
from spacy.tokens import Span

from nlp_architect.models.chunker import SequenceChunker
from nlp_architect.utils.generic import pad_sentences
from nlp_architect.utils.io import validate_existing_filepath
from nlp_architect.utils.text import extract_nps, Stopwords

class NPAnnotator(object):
    """
    Spacy based NP annotator - uses models.SequenceChunker model for annotation

    Instances are callable on a spaCy ``Doc`` and store the extracted noun
    phrase spans in the ``doc._.noun_phrases`` extension attribute.

    Args:
        model (SequenceChunker): a chunker model
        word_vocab (Vocabulary): word-id vocabulary of the model
        char_vocab (Vocabulary): char id vocabulary of words of the model
        chunk_vocab (Vocabulary): chunk tag vocabulary of the model
        batch_size (int, optional): inference batch size
    """

    def __init__(self, model, word_vocab, char_vocab, chunk_vocab, batch_size: int = 32):
        self.model = model
        # Stored separately from the model; used as the inference batch size.
        self.batch_size = batch_size
        self.word_vocab = word_vocab
        self.char_vocab = char_vocab
        self.chunk_vocab = chunk_vocab
        # Register the custom attribute read/written by get_noun_phrases /
        # set_noun_phrases. force=True overwrites a previous registration, so
        # constructing several annotators does not raise.
        Doc.set_extension("noun_phrases", default=[], force=True)

    @classmethod
    def load(
        cls, model_path: str, parameter_path: str, batch_size: int = 32, use_cudnn: bool = False
    ):
        """
        Load a NPAnnotator annotator

        Both paths are joined onto the directory containing this module; an
        absolute path is therefore used as-is.

        Args:
            model_path (str): path to trained model
            parameter_path (str): path to model parameters
            batch_size (int, optional): inference batch_size
            use_cudnn (bool, optional): use gpu for inference (cudnn cells)

        Returns:
            NPAnnotator class with loaded model
        """
        module_dir = path.dirname(path.realpath(__file__))

        _model_path = path.join(module_dir, model_path)
        validate_existing_filepath(_model_path)
        _parameter_path = path.join(module_dir, parameter_path)
        validate_existing_filepath(_parameter_path)

        model = SequenceChunker(use_cudnn=use_cudnn)
        model.load(_model_path)

        # NOTE: pickle.load can execute arbitrary code - only load parameter
        # files that come from a trusted source.
        with open(_parameter_path, "rb") as fp:
            model_params = pickle.load(fp)

        word_vocab = model_params["word_vocab"]
        chunk_vocab = model_params["chunk_vocab"]
        # The char vocabulary is optional; without it only word ids are fed
        # to the model.
        char_vocab = model_params.get("char_vocab", None)
        return cls(model, word_vocab, char_vocab, chunk_vocab, batch_size)

    def _infer_chunks(self, input_vec, doc_lengths):
        """
        Run the chunker on padded inputs and return the extracted NP indexes.

        Args:
            input_vec: model input (word ids, or [word ids, char ids])
            doc_lengths (list of int): number of tokens of each sentence

        Returns:
            the result of ``extract_nps`` over the concatenated chunk tags
        """
        # NOTE(review): the `batch_size=` argument and the `.argmax(2)`
        # reduction were missing from the damaged source text; they are
        # restored here so that integer tag ids reach `id_to_word` below.
        # Presumably predict() returns (sentences, tokens, tags) scores -
        # confirm against the chunker model.
        tagged_sents = self.model.predict(input_vec, batch_size=self.batch_size).argmax(2)
        # Keep only the trailing `length` tags of every row, i.e. drop the
        # positions that were added by padding.
        sentence_tags = [tags[-length:] for tags, length in zip(tagged_sents, doc_lengths)]
        doc_tags = np.concatenate(sentence_tags)
        chunk_tags = [self.chunk_vocab.id_to_word(tag_id) for tag_id in doc_tags]
        return extract_nps(chunk_tags)

    def _feature_extractor(self, doc):
        """
        Convert a list of token strings into model features.

        Args:
            doc (list of str): tokens of one sentence

        Returns:
            an array of word ids, or a ``(word ids, padded char ids)`` tuple
            when a char vocabulary is available. Ids missing from a
            vocabulary (lookup returns None) are replaced by 1.
        """
        word_ids = []
        for word in doc:
            word_id = self.word_vocab[word]
            word_ids.append(word_id if word_id is not None else 1)
        features = np.asarray(word_ids)

        if self.char_vocab:
            sentence_chars = []
            for word in doc:
                word_chars = []
                for char in word:
                    char_id = self.char_vocab[char]
                    word_chars.append(char_id if char_id is not None else 1)
                sentence_chars.append(word_chars)
            sentence_chars = pad_sentences(sentence_chars, self.model.max_word_len)
            features = (features, sentence_chars)
        return features

    def __call__(self, doc: Doc) -> Doc:
        """
        Annotate the document with noun phrase spans

        Args:
            doc (Doc): a spacy document with sentence boundaries set

        Returns:
            the same document, with ``doc._.noun_phrases`` set (an empty
            document is returned untouched)
        """
        if len(doc) < 1:
            return doc

        doc_vecs = []
        doc_chars = []
        doc_lens = []
        for sentence in doc.sents:
            features = self._feature_extractor([t.text for t in sentence])
            if isinstance(features, tuple):
                doc_vec = features[0]
                doc_chars.append(features[1])
            else:
                doc_vec = features
            doc_vecs.append(doc_vec)
            doc_lens.append(len(doc_vec))

        doc_vectors = pad_sentences(np.asarray(doc_vecs))
        inputs = doc_vectors

        if self.char_vocab:
            max_len = doc_vectors.shape[1]
            padded_chars = np.zeros((len(doc_chars), max_len, self.model.max_word_len))
            for idx, chars in enumerate(doc_chars):
                chars = chars[:max_len]
                # Align the characters with the end of the row, matching the
                # trailing slice taken in _infer_chunks.
                padded_chars[idx, -chars.shape[0] :] = chars
            inputs = [inputs, padded_chars]

        np_indexes = self._infer_chunks(inputs, doc_lens)
        spans = [Span(doc, start, end) for start, end in np_indexes]
        spans = _NPPostprocessor.process(spans)
        set_noun_phrases(doc, spans)
        return doc
def get_noun_phrases(doc: Doc) -> [Span]:
    """
    Get noun phrase tags from a spacy annotated document.

    Args:
        doc (Doc): a spacy type document

    Returns:
        a list of noun phrase Span objects

    Raises:
        AssertionError: if the document has no ``noun_phrases`` extension
    """
    # Explicit raise instead of `assert` so the check also runs under
    # `python -O`; the exception type and message are unchanged.
    if not hasattr(doc._, "noun_phrases"):
        raise AssertionError("no noun_phrase attributes in document")
    return doc._.noun_phrases
def set_noun_phrases(doc: Doc, nps: [Span]) -> None:
    """
    Set noun phrase tags

    Args:
        doc (Doc): a spacy type document
        nps ([Span]): a list of Spans

    Raises:
        AssertionError: if the document has no ``noun_phrases`` extension
    """
    # Explicit raise instead of `assert` so the check also runs under
    # `python -O`; the exception type and message are unchanged.
    if not hasattr(doc._, "noun_phrases"):
        raise AssertionError("no noun_phrase attributes in document")
    doc._.set("noun_phrases", nps)
class _NPPostprocessor:
    """Cleans candidate noun phrase spans using ``post_processing_rules``."""

    @classmethod
    def process(cls, noun_phrases: [Span]) -> [Span]:
        """
        Apply the post-processing rules to every phrase.

        Args:
            noun_phrases ([Span]): candidate noun phrase spans

        Returns:
            the processed spans, without rejected (None) or empty ones
        """
        new_phrases = []
        for phrase in noun_phrases:
            processed = cls._phrase_process(phrase)
            if processed is not None and len(processed) > 0:
                new_phrases.append(processed)
        return new_phrases

    @classmethod
    def _phrase_process(cls, phrase: Span) -> Span:
        """
        Apply all rules repeatedly until a full pass leaves the phrase unchanged.

        Returns:
            the final span, or None as soon as any rule rejects the phrase
        """
        last_phrase = None
        while phrase != last_phrase:
            last_phrase = phrase
            for rule, *args in post_processing_rules:
                phrase = rule(phrase, *args)
                if phrase is None:
                    # Rejected: stop here instead of feeding None back
                    # through the rule list.
                    return None
        return phrase


def _filter_repeating_nonalnum(phrase, length):
    """
    Reject a phrase containing `length` consecutive tokens that are not
    purely alphabetic (``token.is_alpha`` is False).

    Example: 'phrase $ $ $' with length=3 is rejected (returns None)
    """
    remaining = length
    for token in phrase:
        # Any alphabetic token resets the run of non-alphabetic tokens.
        remaining = length if token.is_alpha else remaining - 1
        if remaining == 0:
            return None
    return phrase


def _filter_long_phrases(phrase, word_length, phrase_length):
    """
    Reject a phrase that has more than `phrase_length` tokens AND whose
    longest token has more than `word_length` characters.
    """
    if (
        len(phrase) > 0
        and max(len(token) for token in phrase) > word_length
        and len(phrase) > phrase_length
    ):
        return None
    return phrase


def _remove_non_alphanum_from_start(phrase):
    """Drop one leading non-alphabetic token (never the only token)."""
    if len(phrase) > 1 and not phrase[0].is_alpha:
        phrase = phrase[1:]
    return phrase


def _remove_non_alphanum_from_end(phrase):
    """Drop one trailing non-alphabetic token (never the only token)."""
    if len(phrase) > 1 and not phrase[-1].is_alpha:
        phrase = phrase[:-1]
    return phrase


def _remove_stop_words(phrase):
    """Strip stop words from both ends of the phrase."""

    def _is_stop_word(token):
        # spaCy's own flag first; the project stop word list is consulted
        # only when that flag is not set.
        return token.is_stop or str(token).strip().lower() in Stopwords.get_words()

    while len(phrase) > 0 and _is_stop_word(phrase[0]):
        phrase = phrase[1:]
    while len(phrase) > 0 and _is_stop_word(phrase[-1]):
        phrase = phrase[:-1]
    return phrase


def _remove_char_at_start(phrase):
    """Strip leading tokens whose text is one of a fixed set of symbols."""
    chars = ("@", "-", "=", ".", ":", "+", "?", "nt", '"', "'", "'S", "'s", ",")
    if phrase:
        while len(phrase) > 0 and phrase[0].text in chars:
            phrase = phrase[1:]
    return phrase


def _remove_char_at_end(phrase):
    """Strip trailing tokens whose text is one of a fixed set of symbols."""
    chars = (",", "(", ")", " ", "-")
    if phrase:
        while len(phrase) > 0 and phrase[-1].text in chars:
            phrase = phrase[:-1]
    return phrase


def _remove_pos_from_start(phrase):
    """Strip leading tokens with an unwanted coarse POS or fine-grained tag."""
    tag_list = ("WDT", "PRP$", ":")
    pos_list = ("PUNCT", "INTJ", "NUM", "PART", "ADV", "DET", "PRON", "VERB")
    if phrase:
        while len(phrase) > 0 and (phrase[0].pos_ in pos_list or phrase[0].tag_ in tag_list):
            phrase = phrase[1:]
    return phrase


def _remove_pos_from_end(phrase):
    """Strip trailing tokens with an unwanted coarse POS or fine-grained tag."""
    tag_list = ("WDT", ":")
    pos_list = ("DET", "PUNCT", "CONJ")
    if phrase:
        while len(phrase) > 0 and (phrase[-1].pos_ in pos_list or phrase[-1].tag_ in tag_list):
            phrase = phrase[:-1]
    return phrase


def _filter_single_pos(phrase):
    """Reject a one-token phrase that is a verb, adjective or adverb."""
    pos_list = ("VERB", "ADJ", "ADV")
    if phrase and len(phrase) == 1 and phrase[0].pos_ in pos_list:
        return None
    return phrase


def _filter_fp_nums(phrase):
    """Reject a phrase whose text (commas removed) parses as a float."""
    if len(phrase) > 0:
        try:
            float(phrase.text.replace(",", ""))
        except ValueError:
            return phrase
        return None
    return phrase


def _filter_single_char(phrase):
    """Reject a phrase made of a single one-character token."""
    if phrase and len(phrase) == 1 and len(phrase[0]) == 1:
        return None
    return phrase


def _filter_empty(phrase):
    """Reject None, zero-length and whitespace-only phrases."""
    if (
        phrase is None
        or len(phrase) == 0
        or len(phrase.text) == 0
        or len(str(phrase.text).strip()) == 0
    ):
        return None
    return phrase


# Each entry is (rule function, *extra positional arguments). The rules are
# applied in this order on every pass of _NPPostprocessor._phrase_process.
post_processing_rules = [
    (_filter_single_char,),
    (_filter_single_pos,),
    (_remove_pos_from_start,),
    (_remove_pos_from_end,),
    (_remove_stop_words,),
    (_remove_non_alphanum_from_start,),
    (_remove_non_alphanum_from_end,),
    (_filter_repeating_nonalnum, 5),
    (_filter_long_phrases, 5, 75),
    (_remove_char_at_start,),
    (_remove_char_at_end,),
    (_filter_fp_nums,),
    (_filter_empty,),
]
class SpacyNPAnnotator(object):
    """
    Simple Spacy pipe with NP extraction annotations

    Builds a spaCy pipeline made of a sentencizer (first), the loaded
    model's "tagger" pipe when it has one, and an ``NPAnnotator`` (last).

    Args:
        model_path (str): path to the trained chunker model, joined onto
            this module's directory
        settings_path (str): path to the model parameters file, joined onto
            this module's directory
        spacy_model (str, optional): name passed to ``spacy.load``
        batch_size (int, optional): inference batch size
        use_cudnn (bool, optional): use gpu for inference (cudnn cells)
    """

    def __init__(self, model_path, settings_path, spacy_model="en", batch_size=32, use_cudnn=False):
        module_dir = path.dirname(path.realpath(__file__))
        _model_path = path.join(module_dir, model_path)
        validate_existing_filepath(_model_path)
        _settings_path = path.join(module_dir, settings_path)
        validate_existing_filepath(_settings_path)

        nlp = spacy.load(spacy_model)
        # Keep only the tagger from the stock pipeline.
        for pipe_name in nlp.pipe_names:
            if pipe_name not in ["tagger"]:
                nlp.remove_pipe(pipe_name)
        # NPAnnotator iterates doc.sents, so sentence boundaries must be set
        # before it runs.
        nlp.add_pipe(nlp.create_pipe("sentencizer"), first=True)
        nlp.add_pipe(
            # Pass the resolved settings path, i.e. the one validated above.
            NPAnnotator.load(
                _model_path, _settings_path, batch_size=batch_size, use_cudnn=use_cudnn
            ),
            last=True,
        )
        self.nlp = nlp

    def __call__(self, text: str) -> [str]:
        """
        Parse a given text and return a list of noun phrases found

        Args:
            text (str): a text string

        Returns:
            list of noun phrases as strings
        """
        # `phrase` rather than `np`, which would shadow the numpy alias.
        return [phrase.text for phrase in get_noun_phrases(self.nlp(text))]